Try some clustering
library(tidyverse)
── Attaching core tidyverse packages ──────────── tidyverse 2.0.0 ──
âś” dplyr 1.1.0 âś” readr 2.1.4
âś” forcats 1.0.0 âś” stringr 1.5.0
âś” ggplot2 3.4.1 âś” tibble 3.1.8
âś” lubridate 1.9.2 âś” tidyr 1.3.0
✔ purrr 1.0.1 ── Conflicts ────────────────────────────── tidyverse_conflicts() ──
âś– dplyr::filter() masks stats::filter()
âś– dplyr::lag() masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(tidymodels)
── Attaching packages ────────────────────────── tidymodels 1.0.0 ──
âś” broom 1.0.3 âś” rsample 1.1.1
âś” dials 1.1.0 âś” tune 1.0.1
âś” infer 1.0.4 âś” workflows 1.1.2
âś” modeldata 1.1.0 âś” workflowsets 1.0.0
âś” parsnip 1.0.3 âś” yardstick 1.1.0
âś” recipes 1.0.4
── Conflicts ───────────────────────────── tidymodels_conflicts() ──
âś– scales::discard() masks purrr::discard()
âś– dplyr::filter() masks stats::filter()
âś– recipes::fixed() masks stringr::fixed()
âś– dplyr::lag() masks stats::lag()
âś– yardstick::spec() masks readr::spec()
âś– recipes::step() masks stats::step()
• Use tidymodels_prefer() to resolve common conflicts.
library(cluster)
library(ggfortify)
Registered S3 method overwritten by 'ggfortify':
method from
autoplot.glmnet parsnip
get leaflength data
leaflength <- read_csv("../../plant/output/leaf_lengths_metabolite.csv") %>%
mutate(pot=str_pad(pot, width=3, pad="0"),
sampleID=str_c("wyo", genotype, pot, sep="_")) %>%
select(sampleID, genotype, trt, leaf_avg, leaf_avg_std)
Rows: 36 Columns: 6── Column specification ────────────────────────────────────────────
Delimiter: ","
chr (3): soil, genotype, trt
dbl (3): pot, leaf_avg, leaf_avg_std
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
leaflength %>% arrange(sampleID)
get and wrangle metabolite data
met_raw <-read_csv("../input/metabolites_set1.csv")
Rows: 72 Columns: 671── Column specification ────────────────────────────────────────────
Delimiter: ","
chr (6): tissue, soil, genotype, autoclave, time_point, concat...
dbl (665): submission_number, pot, sample_mass mg, xylulose NIST...
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
met <- met_raw %>%
mutate(pot=str_pad(pot, width = 3, pad = "0")) %>%
mutate(sampleID=str_c("wyo", genotype, pot, sep="_")) %>%
mutate(trt=ifelse(is.na(autoclave), "BLANK", autoclave)) %>%
select(sampleID, genotype, tissue, trt, sample_mass = `sample_mass mg`, !submission_number:concatenate) %>%
#bring in leaf length
left_join({leaflength %>% select(sampleID, leaf_avg_std)}) %>%
select(sampleID, genotype, tissue, trt, leaf_avg_std, everything()) %>%
#make long
pivot_longer(!sampleID:sample_mass, names_to = "metabolite", values_to = "met_amt") %>%
#filter away unnamed
filter(str_detect(metabolite, pattern="^[0-9]+$", negate=TRUE)) %>%
#adjust by sample mass
mutate(met_per_mg=met_amt/sample_mass) %>%
pivot_wider(id_cols = c(sampleID, genotype, trt, leaf_avg_std),
names_from = c(tissue, metabolite),
values_from = starts_with("met_"),
names_sep = "_")
Joining with `by = join_by(sampleID)`
met
split this into two data frames, one normalized by tissue amount and one not.
met_per_mg <- met %>% select(sampleID, genotype, trt, starts_with("met_per_mg")) %>%
as.data.frame() %>% column_to_rownames("sampleID")
met_amt <- met %>% select(sampleID, genotype, trt, starts_with("met_amt")) %>%
as.data.frame() %>% column_to_rownames("sampleID")
met_per_mg.scale <- met_per_mg %>% mutate(across(.cols=-c(genotype, trt), .fns=scale))
met_per_mg.scale.t <- met_per_mg.scale %>% select(-genotype, -trt) %>% t() %>% as.data.frame()
metadata for later plotting
sample_meta <- leaflength %>%
select(sample=sampleID, genotype, trt, leaf_avg, leaf_avg_std) %>%
mutate(sample_merge=make.unique(str_c(genotype, "_", trt)))
sample_meta
met_per_mg.pca <- met_per_mg.scale.t %>% prcomp(scale.=TRUE)
met_per_mg_PCs <- met_per_mg.pca %>% magrittr::extract2("x") %>% as.data.frame()
met_per_mg_PCs %>% ggplot(aes(x=PC1,y=PC2)) +
geom_point()
met_per_mg_PCs %>% ggplot(aes(x=PC2,y=PC3)) +
geom_point()
met_per_mg_PCs %>% ggplot(aes(x=PC3,y=PC4)) +
geom_point()
met_per_mg.mds <- met_per_mg.scale.t %>% dist() %>% cmdscale(x.ret=TRUE)
autoplot(met_per_mg.mds)
met_per_mg.tsne <- met_per_mg.scale.t %>% tsne::tsne()
sigma summary: Min. : 0.320970396293183 |1st Qu. : 0.512682589759317 |Median : 0.573038635973548 |Mean : 0.607180548050454 |3rd Qu. : 0.668422063255215 |Max. : 1.09511513963814 |
Epoch: Iteration #100 error is: 17.2872595160823
Epoch: Iteration #200 error is: 0.977562788493326
Epoch: Iteration #300 error is: 0.906612716558305
Epoch: Iteration #400 error is: 0.899198146968262
Epoch: Iteration #500 error is: 0.892931378266666
Epoch: Iteration #600 error is: 0.884506312848133
Epoch: Iteration #700 error is: 0.884017505672013
Epoch: Iteration #800 error is: 0.882242081478597
Epoch: Iteration #900 error is: 0.879067815552968
Epoch: Iteration #1000 error is: 0.874062779961324
met_per_mg.tsne %>% plot()
kclust <- tibble(k=3:8) %>%
mutate(kclust=map(k, ~kmeans(met_per_mg.scale.t, .x)),
tidied = map(kclust, tidy),
glanced = map(kclust, glance),
augmented = map(kclust, augment, as_tibble(met_per_mg.mds$points) )
)
Warning: There was 1 warning in `mutate()`.
ℹ In argument: `augmented = map(kclust, augment,
as_tibble(met_per_mg.mds$points))`.
Caused by warning:
! The `x` argument of `as_tibble.matrix()` must have unique column
names if `.name_repair` is omitted as of tibble 2.0.0.
ℹ Using compatibility `.name_repair`.
This warning is displayed once every 8 hours.
Call `lifecycle::last_lifecycle_warnings()` to see where this
warning was generated.
kclust
clusters <-
kclust %>%
unnest(cols = c(tidied))
assignments <-
kclust %>%
unnest(cols = c(augmented))
clusterings <-
kclust %>%
unnest(cols = c(glanced))
RColorBrewer::display.brewer.all(type="qual")
p1 <-
ggplot(assignments, aes(x = V1, y = V2)) +
geom_point(aes(color = .cluster)) +
facet_wrap(~ k) +
scale_color_brewer(palette="Set1", direction = -1) +
ggtitle("kmeans")
p1
ggplot(clusterings, aes(k, tot.withinss)) +
geom_line() +
geom_point()
pcluste <- tibble(k=3:8) %>%
mutate(pcluste=map(k, ~pam(met_per_mg.scale.t, .x, diss=FALSE)),
tidied = map(pcluste, tidy),
glanced = map(pcluste, glance),
augmented = map(pcluste, augment, as_tibble(met_per_mg.mds$points) )
)
pcluste
clusters <-
pcluste %>%
unnest(cols = c(tidied))
assignments <-
pcluste %>%
unnest(cols = c(augmented))
clusterings <-
pcluste %>%
unnest(cols = c(glanced))
p2 <-
ggplot(assignments, aes(x = V1, y = V2)) +
geom_point(aes(color = .cluster)) +
facet_wrap(~ k) +
scale_color_brewer(palette="Set1", direction = -1) +
ggtitle("Pam Euclidean")
p2
ggplot(clusterings, aes(k, avg.silhouette.width)) +
geom_line() +
geom_point()
pclustm <- tibble(k=3:8) %>%
mutate(pclustm=map(k, ~pam(met_per_mg.scale.t, .x, diss=FALSE, metric="manhattan")),
tidied = map(pclustm, tidy),
glanced = map(pclustm, glance),
augmented = map(pclustm, augment, as_tibble(met_per_mg.mds$points) )
)
pclustm
clusters <-
pclustm %>%
unnest(cols = c(tidied))
assignments <-
pclustm %>%
unnest(cols = c(augmented))
clusterings <-
pclustm %>%
unnest(cols = c(glanced))
p3 <-
ggplot(assignments, aes(x = V1, y = V2)) +
geom_point(aes(color = .cluster)) +
facet_wrap(~ k) +
scale_color_brewer(palette="Set1", direction = -1) +
ggtitle("Pam Manhattan")
p3
ggplot(clusterings, aes(k, avg.silhouette.width)) +
geom_line() +
geom_point()
p1
p2
p3
cg <- clusGap(met_per_mg.scale.t, kmeans, K.max=9)
Clustering k = 1,2,..., K.max (= 9): .. done
Bootstrapping, b = 1,2,..., B (= 100) [one "." per sample]:
.................................................. 50
.................................................. 100
plot(cg)
cg
Clustering Gap statistic ["clusGap"] from call:
clusGap(x = met_per_mg.scale.t, FUNcluster = kmeans, K.max = 9)
B=100 simulated reference sets, k = 1..9; spaceH0="scaledPCA"
--> Number of clusters (method 'firstSEmax', SE.factor=1): 7
logW E.logW gap SE.sim
[1,] 6.421788 6.918019 0.4962311 0.004606659
[2,] 6.250008 6.870524 0.6205159 0.004473502
[3,] 6.183201 6.847694 0.6644921 0.004486972
[4,] 6.138483 6.828390 0.6899078 0.004671327
[5,] 6.084682 6.813268 0.7285859 0.004559963
[6,] 6.054436 6.800183 0.7457464 0.004613803
[7,] 6.032065 6.788598 0.7565327 0.004643470
[8,] 6.037614 6.778026 0.7404117 0.004370461
[9,] 5.996726 6.768415 0.7716891 0.004303381
cg <- clusGap(met_per_mg.scale.t, pam, K.max=9, diss=FALSE)
Clustering k = 1,2,..., K.max (= 9): .. done
Bootstrapping, b = 1,2,..., B (= 100) [one "." per sample]:
.................................................. 50
.................................................. 100
plot(cg)
cg
Clustering Gap statistic ["clusGap"] from call:
clusGap(x = met_per_mg.scale.t, FUNcluster = pam, K.max = 9, diss = FALSE)
B=100 simulated reference sets, k = 1..9; spaceH0="scaledPCA"
--> Number of clusters (method 'firstSEmax', SE.factor=1): 9
logW E.logW gap SE.sim
[1,] 6.421788 6.918418 0.4966293 0.005578790
[2,] 6.270777 6.891267 0.6204893 0.008540835
[3,] 6.206398 6.872026 0.6656280 0.007369309
[4,] 6.146155 6.856340 0.7101848 0.007337648
[5,] 6.101192 6.843736 0.7425440 0.006582442
[6,] 6.075694 6.832732 0.7570376 0.006449575
[7,] 6.046747 6.822472 0.7757246 0.006428820
[8,] 6.026313 6.812510 0.7861972 0.005957189
[9,] 6.009753 6.803857 0.7941044 0.005969037
cg <- clusGap(met_per_mg.scale.t, pam, K.max=9, diss=FALSE, metric="manhattan")
Clustering k = 1,2,..., K.max (= 9): .. done
Bootstrapping, b = 1,2,..., B (= 100) [one "." per sample]:
.................................................. 50
.................................................. 100
plot(cg)
cg
Clustering Gap statistic ["clusGap"] from call:
clusGap(x = met_per_mg.scale.t, FUNcluster = pam, K.max = 9, diss = FALSE, metric = "manhattan")
B=100 simulated reference sets, k = 1..9; spaceH0="scaledPCA"
--> Number of clusters (method 'firstSEmax', SE.factor=1): 9
logW E.logW gap SE.sim
[1,] 6.421788 6.918370 0.4965821 0.004975897
[2,] 6.298241 6.896911 0.5986703 0.006074368
[3,] 6.229194 6.880361 0.6511672 0.006923682
[4,] 6.190501 6.866868 0.6763669 0.007333121
[5,] 6.144870 6.855027 0.7101571 0.006572353
[6,] 6.115488 6.845459 0.7299710 0.006443715
[7,] 6.105115 6.835345 0.7302301 0.006252574
[8,] 6.089468 6.826578 0.7371098 0.006362957
[9,] 6.068056 6.818503 0.7504473 0.005834587
plot_clusters <- function(x, meta=sample_meta) {
x %>% pivot_longer(-c(.rownames,.cluster), names_to="sample") %>%
left_join(sample_meta) %>%
ggplot(aes(x=sample_merge, y=value)) +
geom_line(alpha=0.1, aes(group=.rownames)) +
facet_wrap(~.cluster) +
geom_vline(color="red", lwd=.5, xintercept = c(seq(6.5,30.5, by=6))) +
theme(axis.text.x = element_text(size=7, vjust=0.5, hjust=1, angle=90))
}
assignments.pe <- pcluste %>%
mutate(met_per_mg = map(pcluste, augment, met_per_mg.scale.t),
cluster_plot = map(met_per_mg, plot_clusters))
Joining with `by = join_by(sample)`Joining with `by = join_by(sample)`Joining with `by = join_by(sample)`Joining with `by = join_by(sample)`Joining with `by = join_by(sample)`Joining with `by = join_by(sample)`
walk(assignments.pe$cluster_plot, print)
met_amt.scale <- met_amt %>% mutate(across(.cols=-c(genotype, trt), .fns=scale))
met_amt.scale.t <- met_amt.scale %>% select(-genotype, -trt) %>% t() %>% as.data.frame()
met_amt.pca <- met_amt.scale.t %>% prcomp(scale.=TRUE)
met_amt_PCs <- met_amt.pca %>% magrittr::extract2("x") %>% as.data.frame()
met_amt_PCs %>% ggplot(aes(x=PC1,y=PC2)) +
geom_point()
met_amt_PCs %>% ggplot(aes(x=PC2,y=PC3)) +
geom_point()
met_amt_PCs %>% ggplot(aes(x=PC3,y=PC4)) +
geom_point()
met_amt.mds <- met_amt.scale.t %>% dist() %>% cmdscale(x.ret=TRUE)
autoplot(met_amt.mds)
met_amt.tsne <- met_amt.scale.t %>% tsne::tsne()
sigma summary: Min. : 0.434986821697206 |1st Qu. : 0.595665953360781 |Median : 0.676181808785075 |Mean : 0.695106609037197 |3rd Qu. : 0.780506015083736 |Max. : 1.09428067361697 |
Epoch: Iteration #100 error is: 19.2045994327469
Epoch: Iteration #200 error is: 1.18083500114745
Epoch: Iteration #300 error is: 1.06888956683155
Epoch: Iteration #400 error is: 1.02637753810654
Epoch: Iteration #500 error is: 1.01997491422502
Epoch: Iteration #600 error is: 1.01374404913146
Epoch: Iteration #700 error is: 0.997678161350794
Epoch: Iteration #800 error is: 0.993447211926238
Epoch: Iteration #900 error is: 0.990801365094571
Epoch: Iteration #1000 error is: 0.990058796867227
met_amt.tsne %>% plot()
kclust <- tibble(k=3:8) %>%
mutate(kclust=map(k, ~kmeans(met_amt.scale.t, .x)),
tidied = map(kclust, tidy),
glanced = map(kclust, glance),
augmented = map(kclust, augment, as_tibble(met_amt.mds$points) )
)
kclust
clusters <-
kclust %>%
unnest(cols = c(tidied))
assignments <-
kclust %>%
unnest(cols = c(augmented))
clusterings <-
kclust %>%
unnest(cols = c(glanced))
p1 <-
ggplot(assignments, aes(x = V1, y = V2)) +
geom_point(aes(color = .cluster)) +
facet_wrap(~ k) +
scale_color_brewer(palette="Set1", direction = -1) +
ggtitle("kmeans")
p1
ggplot(clusterings, aes(k, tot.withinss)) +
geom_line() +
geom_point()
pcluste <- tibble(k=3:8) %>%
mutate(pcluste=map(k, ~pam(met_amt.scale.t, .x, diss=FALSE)),
tidied = map(pcluste, tidy),
glanced = map(pcluste, glance),
augmented = map(pcluste, augment, as_tibble(met_amt.mds$points) )
)
pcluste
clusters <-
pcluste %>%
unnest(cols = c(tidied))
assignments <-
pcluste %>%
unnest(cols = c(augmented))
clusterings <-
pcluste %>%
unnest(cols = c(glanced))
p2 <-
ggplot(assignments, aes(x = V1, y = V2)) +
geom_point(aes(color = .cluster)) +
facet_wrap(~ k) +
scale_color_brewer(palette="Set1", direction = -1) +
ggtitle("Pam Euclidean")
p2
ggplot(clusterings, aes(k, avg.silhouette.width)) +
geom_line() +
geom_point()
pclustm <- tibble(k=3:8) %>%
mutate(pclustm=map(k, ~pam(met_amt.scale.t, .x, diss=FALSE, metric="manhattan")),
tidied = map(pclustm, tidy),
glanced = map(pclustm, glance),
augmented = map(pclustm, augment, as_tibble(met_amt.mds$points) )
)
pclustm
clusters <-
pclustm %>%
unnest(cols = c(tidied))
assignments <-
pclustm %>%
unnest(cols = c(augmented))
clusterings <-
pclustm %>%
unnest(cols = c(glanced))
p3 <-
ggplot(assignments, aes(x = V1, y = V2)) +
geom_point(aes(color = .cluster)) +
facet_wrap(~ k) +
scale_color_brewer(palette="Set1", direction = -1) +
ggtitle("Pam Manhattan")
p3
ggplot(clusterings, aes(k, avg.silhouette.width)) +
geom_line() +
geom_point()
p1
p2
p3
cg <- clusGap(met_amt.scale.t, kmeans, K.max=9)
Clustering k = 1,2,..., K.max (= 9): .. done
Bootstrapping, b = 1,2,..., B (= 100) [one "." per sample]:
.................................................. 50
.................................................. 100
plot(cg)
cg
Clustering Gap statistic ["clusGap"] from call:
clusGap(x = met_amt.scale.t, FUNcluster = kmeans, K.max = 9)
B=100 simulated reference sets, k = 1..9; spaceH0="scaledPCA"
--> Number of clusters (method 'firstSEmax', SE.factor=1): 8
logW E.logW gap SE.sim
[1,] 6.563355 6.991556 0.4282007 0.004963666
[2,] 6.482151 6.945124 0.4629731 0.004996273
[3,] 6.442064 6.920937 0.4788730 0.004533869
[4,] 6.384038 6.902830 0.5187923 0.004753582
[5,] 6.356412 6.888007 0.5315949 0.004691245
[6,] 6.336569 6.875890 0.5393202 0.004727353
[7,] 6.317922 6.864958 0.5470355 0.004913001
[8,] 6.299876 6.854976 0.5550993 0.004413889
[9,] 6.291947 6.845969 0.5540214 0.004762639
cg <- clusGap(met_amt.scale.t, pam, K.max=9, diss=FALSE)
Clustering k = 1,2,..., K.max (= 9): .. done
Bootstrapping, b = 1,2,..., B (= 100) [one "." per sample]:
.................................................. 50
.................................................. 100
plot(cg)
cg
Clustering Gap statistic ["clusGap"] from call:
clusGap(x = met_amt.scale.t, FUNcluster = pam, K.max = 9, diss = FALSE)
B=100 simulated reference sets, k = 1..9; spaceH0="scaledPCA"
--> Number of clusters (method 'firstSEmax', SE.factor=1): 8
logW E.logW gap SE.sim
[1,] 6.563355 6.991859 0.4285039 0.005297143
[2,] 6.484182 6.966839 0.4826577 0.008211047
[3,] 6.432616 6.948417 0.5158004 0.007966139
[4,] 6.390202 6.933598 0.5433962 0.007710088
[5,] 6.363875 6.920938 0.5570628 0.007291380
[6,] 6.343806 6.910253 0.5664470 0.006931946
[7,] 6.325227 6.900347 0.5751196 0.006931358
[8,] 6.308374 6.891399 0.5830251 0.006742401
[9,] 6.294289 6.882644 0.5883554 0.006017687
cg <- clusGap(met_amt.scale.t, pam, K.max=9, diss=FALSE, metric="manhattan")
Clustering k = 1,2,..., K.max (= 9): .. done
Bootstrapping, b = 1,2,..., B (= 100) [one "." per sample]:
.................................................. 50
.................................................. 100
plot(cg)
cg
Clustering Gap statistic ["clusGap"] from call:
clusGap(x = met_amt.scale.t, FUNcluster = pam, K.max = 9, diss = FALSE, metric = "manhattan")
B=100 simulated reference sets, k = 1..9; spaceH0="scaledPCA"
--> Number of clusters (method 'firstSEmax', SE.factor=1): 9
logW E.logW gap SE.sim
[1,] 6.563355 6.992315 0.4289597 0.004710693
[2,] 6.493257 6.968083 0.4748262 0.007267814
[3,] 6.462072 6.951978 0.4899064 0.007574299
[4,] 6.407769 6.938640 0.5308710 0.006184092
[5,] 6.374247 6.927462 0.5532151 0.006475408
[6,] 6.355711 6.917056 0.5613451 0.006445086
[7,] 6.337981 6.907379 0.5693977 0.006107336
[8,] 6.320155 6.898844 0.5786883 0.006034473
[9,] 6.306475 6.891302 0.5848267 0.005913412
plot_clusters <- function(x, meta=sample_meta) {
x %>% pivot_longer(-c(.rownames,.cluster), names_to="sample") %>%
left_join(sample_meta) %>%
ggplot(aes(x=sample_merge, y=value)) +
geom_line(alpha=0.1, aes(group=.rownames)) +
facet_wrap(~.cluster) +
geom_vline(color="red", lwd=.5, xintercept = c(seq(6.5,30.5, by=6))) +
theme(axis.text.x = element_text(size=7, vjust=0.5, hjust=1, angle=90))
}
assignments.pe <- pcluste %>%
mutate(met_amt = map(pcluste, augment, met_amt.scale.t),
cluster_plot = map(met_amt, plot_clusters))
Joining with `by = join_by(sample)`Joining with `by = join_by(sample)`Joining with `by = join_by(sample)`Joining with `by = join_by(sample)`Joining with `by = join_by(sample)`Joining with `by = join_by(sample)`
walk(assignments.pe$cluster_plot, print)